started: AL26Apr2019
last updated: AL30Aug2019

Summary

Input sequencing data: 1,850 vars x 715 samples ( 519BC = 258UBC + 260CBC and 197NFE)
Input eigenvectors for 715 samples (calculated using 1,634 variants not in LD)

Add eigenvectors to phenotypes, make PCA plots and
calculate 19 PC outliers: outside of mean +/- 3*sd

Start_section

Sys.time()
## [1] "2019-08-30 15:44:57 BST"
rm(list=ls())
graphics.off()

library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"

opts_knit$set(root.dir = base_folder)

options(stringsAsFactors = F)
options(warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)

#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588 

Read_data

# Threshold for outlier detection
th <- 3 # mean +/- (th*sd)

# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_check_BRCA1_BRCA2_PALB2"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"

# Eigenvectors
eigenvectors_file <- paste(base_folder, "ampliseq_nfe_1634_715_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")

# Eigenvalues
eigenvalues_file <- paste(base_folder, "ampliseq_nfe_1634_715_100PCs.eigenval", sep="/")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")

# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file)

Check data

# List of objects
ls()
## [1] "base_folder"     "eigenvalues.df"  "eigenvectors.df" "genotypes.mx"    "phenotypes.df"   "th"              "variants.df"
# Dimentions of objects
dim(eigenvectors.df)
## [1] 715 102
dim(eigenvalues.df)
## [1] 100   1
dim(genotypes.mx)
## [1] 1850  715
dim(phenotypes.df)
## [1] 715  24
dim(variants.df)
## [1] 1850   65
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"long_ids" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
##                    long_ids         PC1          PC2          PC3          PC4
## 100_S8_L007     100_S8_L007 -0.00993811 -0.006220860 -0.000233260  0.000518659
## 101_S528_L008 101_S528_L008 -0.00812998 -0.003457840  0.000624146  0.002445700
## 102_S466_L008 102_S466_L008 -0.00241840  0.004862620 -0.004549510 -0.028571000
## 103_S147_L007 103_S147_L007  0.03403750  0.045008900 -0.019662100  0.025902100
## 104_S325_L008 104_S325_L008 -0.00373370  0.000680981 -0.000333239 -0.003197130

Plot eigenvalues

plot(eigenvalues.df$V1, type="b", 
     xlab="Eigenvector", ylab="Eigenvalue",
     ylim=c(0,15), main="Top 100 eigenvectors")

plot(eigenvalues.df$V1[1:20], type="b", 
     xlab="Eigenvector", ylab="Eigenvalue",
     ylim=c(0,15), main="Top 20 eigenvectors")

rm(eigenvalues.df)

Prepare data for plots

# Add factor column with verbal lables for phenotypes to be used in plot
group <- factor(phenotypes.df$cc, levels = c(0,1,-1), labels = c("UBC","CBC","NFE"))
phenotypes.df <- data.frame(phenotypes.df,group)
table(phenotypes.df$group)
## 
## UBC CBC NFE 
## 258 260 197
# Add column with sample IDs for the plot
sample_id <- phenotypes.df$Sample_num
sample_id[phenotypes.df$group=="NFE"] <- phenotypes.df$long_ids[phenotypes.df$group=="NFE"]
phenotypes.df <- data.frame(phenotypes.df,sample_id)
head(phenotypes.df$sample_id)
## [1] "100" "101" "102" "103" "104" "105"
tail(phenotypes.df$sample_id)
## [1] "NA20819" "NA20821" "NA20822" "NA20826" "NA20828" "NA20832"
# Add 5 top eigenvectors to phenotypes
phenotypes.df <- full_join(phenotypes.df,eigenvectors.df[,1:6], by="long_ids")

# Prepare colour scale for ggplots
myColours <- c("NFE"="green", "UBC"="blue", "CBC"="red")
myColourScale <- scale_colour_manual(values=myColours)

# Clean-up
rm(group, sample_id, eigenvectors.df, myColours)

Calculate thresholds for outliers

pc1_mean <- mean(phenotypes.df$PC1)
pc1_sd <- sd(phenotypes.df$PC1)
pc1_min <- pc1_mean - pc1_sd * th
pc1_max <- pc1_mean + pc1_sd * th

pc2_mean <- mean(phenotypes.df$PC2)
pc2_sd <- sd(phenotypes.df$PC2)
pc2_min <- pc2_mean - pc2_sd * th
pc2_max <- pc2_mean + pc2_sd * th

pc3_mean <- mean(phenotypes.df$PC3)
pc3_sd <- sd(phenotypes.df$PC3)
pc3_min <- pc3_mean - pc3_sd * th
pc3_max <- pc3_mean + pc3_sd * th

pc4_mean <- mean(phenotypes.df$PC4)
pc4_sd <- sd(phenotypes.df$PC4)
pc4_min <- pc4_mean - pc4_sd * th
pc4_max <- pc4_mean + pc4_sd * th

pc5_mean <- mean(phenotypes.df$PC5)
pc5_sd <- sd(phenotypes.df$PC5)
pc5_min <- pc5_mean - pc5_sd * th
pc5_max <- pc5_mean + pc5_sd * th

rm(pc1_mean, pc1_sd, 
   pc2_mean, pc2_sd, 
   pc3_mean, pc3_sd, 
   pc4_mean, pc4_sd, 
   pc5_mean, pc5_sd,
   th)

Mark outliers

# Detect outliers on each PC
pc1_outlier <- phenotypes.df$PC1 < pc1_min | phenotypes.df$PC1 > pc1_max
pc2_outlier <- phenotypes.df$PC2 < pc2_min | phenotypes.df$PC2 > pc2_max
pc3_outlier <- phenotypes.df$PC3 < pc3_min | phenotypes.df$PC3 > pc3_max
pc4_outlier <- phenotypes.df$PC4 < pc4_min | phenotypes.df$PC4 > pc4_max
pc5_outlier <- phenotypes.df$PC5 < pc5_min | phenotypes.df$PC5 > pc5_max

# Detect outliers on any PC
pc_outlier <- pc1_outlier | pc2_outlier | pc3_outlier | pc4_outlier | pc5_outlier

# Check counts of outliers
sum(pc1_outlier)
## [1] 17
sum(pc2_outlier)
## [1] 9
sum(pc3_outlier)
## [1] 6
sum(pc4_outlier)
## [1] 9
sum(pc5_outlier)
## [1] 10
sum(pc_outlier)
## [1] 19
# Add outliers data to phenotypes dataframe
phenotypes.df <- data.frame(phenotypes.df, pc_outlier, pc1_outlier, pc2_outlier,
                            pc3_outlier, pc4_outlier, pc5_outlier)

phenotypes.df %>%
  filter(pc_outlier) %>% 
  select(sample_id, pc_outlier, pc1_outlier, pc2_outlier,
         pc3_outlier, pc4_outlier, pc5_outlier) %>% 
  arrange(as.integer(sample_id))
##    sample_id pc_outlier pc1_outlier pc2_outlier pc3_outlier pc4_outlier pc5_outlier
## 1          3       TRUE        TRUE        TRUE       FALSE        TRUE        TRUE
## 2         16       TRUE        TRUE        TRUE        TRUE        TRUE        TRUE
## 3         23       TRUE       FALSE       FALSE       FALSE       FALSE        TRUE
## 4        141       TRUE        TRUE        TRUE       FALSE       FALSE       FALSE
## 5        148       TRUE        TRUE       FALSE       FALSE        TRUE       FALSE
## 6        246       TRUE        TRUE        TRUE       FALSE       FALSE       FALSE
## 7        273       TRUE        TRUE       FALSE       FALSE        TRUE        TRUE
## 8        275       TRUE        TRUE        TRUE       FALSE        TRUE       FALSE
## 9        277       TRUE        TRUE        TRUE        TRUE        TRUE        TRUE
## 10       311       TRUE        TRUE       FALSE        TRUE       FALSE        TRUE
## 11       323       TRUE        TRUE        TRUE        TRUE        TRUE       FALSE
## 12       329       TRUE        TRUE       FALSE       FALSE       FALSE        TRUE
## 13       352       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 14       355       TRUE       FALSE       FALSE       FALSE       FALSE        TRUE
## 15       366       TRUE        TRUE        TRUE        TRUE        TRUE        TRUE
## 16       368       TRUE        TRUE       FALSE       FALSE        TRUE       FALSE
## 17       375       TRUE        TRUE        TRUE        TRUE       FALSE        TRUE
## 18       403       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 19       408       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
# Clean-up
rm(pc_outlier, pc1_outlier, pc2_outlier, pc3_outlier, pc4_outlier, pc5_outlier)

PC1 vs PC2

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC1 vs PC2", x="PC1", y="PC2") +
  theme(legend.title=element_blank()) + # To suppress the legend title
  myColourScale +                       # otherwise it would be "group"
  geom_vline(xintercept=c(pc1_min, pc1_max), linetype="dotted", size=0.2) +
  geom_hline(yintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC2 vs PC3

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC2,PC3)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC2 vs PC3", x="PC2", y="PC3") +
  theme(legend.title=element_blank()) + 
  myColourScale + 
  geom_vline(xintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2) +
  geom_hline(yintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC3 vs PC4

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC3,PC4)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC3 vs PC4", x="PC3", y="PC4") +
  theme(legend.title=element_blank()) + 
  myColourScale + 
  geom_vline(xintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2) +
  geom_hline(yintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC4 vs PC5

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC4,PC5)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC4 vs PC5", x="PC4", y="PC5") +
  theme(legend.title=element_blank()) + 
  myColourScale + 
  geom_vline(xintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2) +
  geom_hline(yintercept=c(pc5_min, pc5_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g, myColourScale, 
   pc1_min, pc1_max, pc2_min, pc2_max, pc3_min, pc3_max, pc4_min, pc4_max, pc5_min, pc5_max)

Save results

save.image(paste(base_folder, "s01_add_PCs_1634_715.RData", sep="/"))

Final_section

ls()
## [1] "base_folder"   "genotypes.mx"  "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.9.0  ggplot2_3.2.0 dplyr_0.8.1   knitr_1.23   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        later_0.8.0       pillar_1.4.1      compiler_3.5.1    tools_3.5.1       digest_0.6.19     jsonlite_1.6      evaluate_0.14     tibble_2.1.3      gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.2   rlang_0.3.4       shiny_1.3.2       crosstalk_1.0.0   yaml_2.2.0        xfun_0.7          withr_2.1.2       stringr_1.4.0     httr_1.4.0        htmlwidgets_1.3   grid_3.5.1        tidyselect_0.2.5  glue_1.3.1        data.table_1.12.2 R6_2.4.0          rmarkdown_1.13    purrr_0.3.2       tidyr_0.8.3       magrittr_1.5      promises_1.0.1    scales_1.0.0      htmltools_0.3.6   assertthat_0.2.1  xtable_1.8-4      mime_0.7          colorspace_1.4-1  httpuv_1.5.1      labeling_0.3      stringi_1.4.3     lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4
Sys.time()
## [1] "2019-08-30 15:45:00 BST"